require(quanteda)
require(LSX)
source("functions.R")

# Load the stored tokens object (presumably the tokenized Japanese corpus),
# then prune it in two passes: first with the digit regex together with the
# min_nchar = 2 length setting, then with the patterns held in
# newsmap::data_dictionary_newsmap_ja.
toks <- readRDS("data_tokens_ja.RDS")
toks <- tokens_remove(toks, '\\d', valuetype = 'regex', min_nchar = 2)
toks <- tokens_remove(toks, newsmap::data_dictionary_newsmap_ja)

# NOTE: disabled because char_keyness() changed its behavior
# nuke <- char_keyness(toks, dict_ja$targets, window = 10, min_count = 5)

# Build the document-feature matrix with padding slots removed, then trim it
# with a minimum term frequency of 10.
dfmt <- dfm(toks, remove_padding = TRUE)
dfmt <- dfm_trim(dfmt, min_termfreq = 10)

# Turn the seed entries of dict_ja into a seed-word vector.
# NOTE(review): dict_ja is not defined in this script; presumably it comes
# from functions.R -- confirm.
seed <- as.seedwords(
    dict_ja$seeds,
    upper = 1,
    lower = 2
)

# Fit the LSS model on the trimmed dfm with caching enabled.
# Earlier variant, from when char_keyness() supplied `nuke`:
# lss <- textmodel_lss(dfmt, seed, nuke, cache = TRUE)
lss <- textmodel_lss(
    dfmt,
    seed,
    cache = TRUE
)

# Persist the fitted model.
saveRDS(lss, file = "lss_ja.RDS")

# Group the dfm with dfm_group()'s default grouping, take the grouped
# document variables as the output table, and score the grouped documents
# with the fitted LSS model.
dfmt_group <- dfm_group(dfmt)
dat <- docvars(dfmt_group)
# Spell out `newdata` in full rather than relying on R's partial argument
# matching, which silently stops working if the name no longer resolves
# uniquely.
pred <- predict(lss, newdata = dfmt_group, density = TRUE)

dat$lss <- pred$fit
dat$density <- pred$density

# Blank out the scores of documents whose density falls below the first
# quartile. na.rm = TRUE keeps the cutoff computable if any density is NA/NaN
# (NOTE(review): e.g. possibly a document left with no tokens -- confirm);
# without it quantile() stops with an error in that case. Rows whose
# comparison evaluates to NA are left untouched by the assignment.
dat$lss[dat$density < quantile(dat$density, probs = 0.25, na.rm = TRUE)] <- NA

# Standardize the remaining scores; as.numeric() drops the one-column matrix
# (and its attributes) returned by scale().
dat$lss <- as.numeric(scale(dat$lss))

saveRDS(dat, "data_lss_ja.RDS")

